## Ryan Copus and Ryan Hübert
## Measuring How Much Judges Matter for Case Outcomes
## Journal of Law and Courts

# Note to reader: please review the README for important information about 
# software and other requirements

################################################################################
# IMPORTANT NOTE ABOUT HARDWARE COMPATIBILITY
################################################################################

# The XGBoost algorithm is used in the following prediction exercise. However, 
# as of March 2025, h2o.xgboost() is not an available algorithm on Windows or 
# some Macs with silicon chips. The code below will detect whether the 
# h2o.xgboost() algorithm is available, and if not, will use R's `xgboost` 
# package instead. This means results may vary slightly across machines.

################################################################################
# SCRIPT-SPECIFIC USER-SPECIFIED VARIABLES
################################################################################

# Do you want to load partial results (if available) and resume estimation?
# If so, please set `resume <- TRUE` below
# Note: the exists() test is checking whether this script is being run passively 
#       via the 0.FullReplication.R script, which sets its own toggle for resume
if(!exists("resume")) {
  # Clear workspace (comment out if you do not want to do this for some reason)
  rm(list=ls())
  # Do you want to load partial results (if available) and resume estimation?
  resume <- FALSE # if FALSE then any existing results will be over-written
  # How much memory of your computer do you want to use to run h2o models?
  mem <- 64 # this is measured in GB
  # How many cores of your computer do you want to use to run h2o models?
  ncores <- parallel::detectCores() # this will use use all cores
} else {
  # If this script is being executed passively via the 0.FullReplication.R script
  message(paste0("Note: this script will ", 
                 ifelse(resume, "resume partially completed estimation.", 
                        "run the entire script from scratch.")))
  Sys.sleep(5)
}

################################################################################
# LOAD UTILITIES, PACKAGES AND DATA
################################################################################

# Load utils and user-supplied variables
source(here::here("Code/_config.R"))
source(here::here("Code/_utils.R"))

# Load libraries
library("h2o") # used to run machine learning models
library("xgboost") # Some machines cannot run XGBoost in h2o & will need to use `xgboost` package

# Load dataset and lists of predictors
load("Data/9C_DATA.rdata")
load("Data/PREDICTORS.rdata")

################################################################################
# CLEAN TRAINING DATA FOR PREDICTION
################################################################################

# Set the outcome and predictors we will use below
y <- "negative"
x <- c(fes, app.predictors, dist.predictors)

# Set the number of "internal" folds for cross-validation inside h2o algorithms
nfolds <- 10

# Set the "external" folds
set.seed(15) # set seed to ensure same folds every time script is executed
df$fold <- sample(rep_len(1:10, nrow(df))) # we have 10 external folds

# Create a temporary dataset for prediction
jf <- df[, c("row.id", y, "fold", fes, app.predictors, dist.predictors)]

# Drop low values to improve prediction (set minimum below)
minimum <- 200 # can alter this, but may get slightly different results
for(i in names(jf)){
  if(class(jf[, i]) == "factor" | class(jf[, i]) == "character"){
    jf[, i] <- factor(jf[, i])
    low <- names(which(table(jf[, i]) < minimum))
    levels(jf[,i])[levels(jf[,i]) %in% low] <- "Other"
    levels(jf[,i])[levels(jf[,i]) == "OTHER"] <- "Other"
  }
}

# h2o requires response variable to be a factor variable
jf$negative <- factor(jf$negative)

# Data that we will collect below by replacing NAs
pf <- as.data.frame(df[,c("row.id", "fold", "negative")])
pf$expected <- NA
pf$deviance <- NA
rm(df)

# If resume==TRUE, check if there are partial results and load them
new.file <- "Data/deviance.rdata"
if(resume){
  if(!file.exists(new.file)){
    stop(paste0("The data file ", new.file, " does not exist! You must set resume=FALSE or check your working directory is correct."))
  }
  load(new.file)
}

################################################################################
# FIT MODELS TO ESTIMATE JUDGE-FREE PREDICTIONS (WHAT WE CALL "DEVIANCE")
################################################################################

# Keep track of any errors that arise during the estimation
error.counter <- 0

# Keep track of warnings raised in each fold:
fold.warnings <- c()

# Keep any object we've assigned so far
keep.obj <- c(ls(), "keep.obj")

# Iterate over the folds as long as there are missing values
while(length(GetMissingFolds(pf, 'expected', any.NA = FALSE)) > 0 & error.counter <= 5){

  tryCatch({

    # Randomly select a fold to run
    fold <- as.numeric(names(sample(GetMissingFolds(pf, 'expected', any.NA = FALSE))[1]))

    # Connect to the h2o server
    h2o.init(max_mem_size = paste0(mem,"g"))

    # Check if h2o's XGBoost algorithm is available on this machine
    xgb.test <- h2o.xgboost.available()

    message("\n\nNow beginning estimation for fold ", fold, "...")

    # Set a specific feed for each fold
    set.seed(15 + fold) # this is probably not needed, but we set just in case

    # Split dataset into training and hold out sets
    dfp <- jf[jf$fold != fold, ] # training
    dfe <- jf[jf$fold == fold, ] # hold out

    # Convert dataframes to h2o objects
    hdfp <- as.h2o(dfp)
    hdfe <- as.h2o(dfe)

    # Fit a set of models
    message("  >> fitting first GBM...")
    my_gbm2 <- h2o.gbm(x = x,
                       y = y,
                       training_frame = hdfp,
                       model_id = paste0("gbm2", fold, "court"),
                       nfolds = nfolds,
                       fold_assignment = "Modulo",
                       keep_cross_validation_predictions = TRUE,
                       distribution = "bernoulli",
                       ntrees = 1000,
                       max_depth = 2,
                       min_rows = 2,
                       learn_rate = 0.1,
                       seed = 1, score_tree_interval = 10,
                       col_sample_rate = 0.8, sample_rate = 0.8)

    message("  >> fitting second GBM...")
    my_gbm3 <- h2o.gbm(x = x,
                       y = y,
                       training_frame = hdfp,
                       model_id = paste0("gbm3", fold, "court"),
                       nfolds = nfolds,
                       fold_assignment = "Modulo",
                       keep_cross_validation_predictions = TRUE,
                       distribution = "bernoulli",
                       ntrees = 1000,
                       max_depth = 3,
                       min_rows = 2,
                       learn_rate = 0.1,
                       seed = 1)

    message("  >> fitting a LASSO regression...")
    my_lasso <- glmWrapper(paste0("lasso", fold, "court"), x, y, hdfp, nfolds, alpha.val = 1, family = "binomial")
    
    message("  >> fitting a Ridge regression...")
    my_ridge <- glmWrapper(paste0("ridge", fold, "court"), x, y, hdfp, nfolds, alpha.val = 0, family = "binomial")

    message("  >> fitting a Random Forest...")
    my_rf <- h2o.randomForest(x = x,
                              y = y,
                              training_frame = hdfp,
                              model_id = paste0("rf", fold, "court"),
                              nfolds = nfolds,
                              fold_assignment = "Modulo",
                              keep_cross_validation_predictions = TRUE,
                              binomial_double_trees = TRUE,
                              ntrees = 300, # Changed from 1000 since very slow
                              min_rows = 2,
                              seed = 1)

    if(xgb.test){
      ## XGBoost is available in your h2o installation, so run it

      message("  >> fitting an XGBoost (using h2o)...")
      my_xgb <- h2o.xgboost(x = x,
                            y = y,
                            training_frame = hdfp,
                            model_id = paste0("xgb", fold, "court"),
                            nfolds = nfolds,
                            fold_assignment = "Modulo",
                            keep_cross_validation_predictions = TRUE,
                            ntrees = 1000,
                            max_depth = 4,
                            learn_rate = 0.01, # shrinkage
                            seed = 1)

      message("  >> fitting an stacked ensemble...")

      # Set list of base models
      bm <- list(my_gbm2,
                 my_gbm3,
                 my_lasso,
                 my_xgb,
                 my_rf,
                 my_ridge)

      # Train the ensemble
      temp_ens <- h2o.stackedEnsemble(x = x,
                                      y = y,
                                      training_frame = hdfp[, c(x,y)],
                                      model_id = paste0("Exp_ensemble_", fold),
                                      metalearner_algorithm = "AUTO",
                                      metalearner_nfolds = 10,
                                      base_models = bm,
                                      seed = 1)

      # Create panel-free predictions
      dfe_preds <- h2o.predict(temp_ens, newdata = hdfe)[3]
      dfe_preds <- as.data.frame(dfe_preds)[, 1]
      pf$expected[pf$fold == fold] <- dfe_preds

    } else {
      ## XGBoost is not available in your h2o installation, so use R `xgboost`
      ## Note: results may be different, and run-time is substantially longer

      message("  >> fitting an XGBoost (using xgboost)...")
      message("     Note: this will take a while, no progress bar will be visible")

      # Create train/test design matrices (model.matrix creates dummies from factors)
      train_matrix <- model.matrix(~ . - 1, data = dfp[, x])
      test_matrix <- model.matrix(~ . - 1, data = dfe[, x]) # Create a design matrix

      # Convert design matrices to `DMatrix` objects for use in XGBoost
      dtrain <- xgb.DMatrix(data = train_matrix, label = as.numeric(as.character(dfp[[y]])))
      dtest <- xgb.DMatrix(data = test_matrix)

      # Set XGBoost parameters
      xgb_params <- list(
        objective = "binary:logistic",
        max_depth = 4,
        eta = 0.01,  # Shrinkage
        eval_metric = "logloss",
        nthread = ncores  # Use cores as set in _config.R; Note--does not appear to do anything
      )

      # Train to get cross validation predictions
      set.seed(217+fold)
      cv_results <- xgb.cv(
        params = xgb_params,
        data = dtrain,
        nrounds = 1000,
        nfold = nfolds,  #
        prediction = TRUE,  # Store out-of-fold predictions
        verbose = FALSE
      )

      # Train a final model on the full dataset to make test predictions
      my_xgb <- xgb.train(
        params = xgb_params,
        data = dtrain,
        nrounds = 1000,
        verbose = FALSE
      )

      # Get the cv predictions and the validation set predictions
      tr <- getPreds(my_gbm2, hdfe)
      tr <- getPreds(my_gbm3, hdfe, tr)
      tr <- getPreds(my_lasso, hdfe, tr)
      tr <- getPreds(my_ridge, hdfe, tr)
      tr <- getPreds(my_rf, hdfe, tr)
      tr <- getPreds(my_xgb, dtest, tr, model_cv_only = cv_results)

      tr[["cv.preds"]] <- cbind(tr[["cv.preds"]], dfp[,y,drop = FALSE])
      tr[["val.preds"]] <- cbind(tr[["val.preds"]], dfe[,y,drop = FALSE])

      message("  >> fitting an stacked ensemble...")

      # Train the ensemble
      temp_ens <- h2o.glm(x = colnames(tr[["cv.preds"]] )[1:ncol(tr[["cv.preds"]] )-1],
                          y = y,
                          training_frame = as.h2o(tr[["cv.preds"]]),
                          model_id = paste0("Exp_ensemble_", fold),
                          nfolds = nfolds,
                          fold_assignment = "Modulo",
                          keep_cross_validation_predictions = TRUE,
                          family = "binomial",
                          non_negative = TRUE,
                          seed = 1)

      # Create panel-free predictions
      dfe_preds <- h2o.predict(temp_ens, newdata = as.h2o(tr[["val.preds"]]))[3]
      dfe_preds <- as.data.frame(dfe_preds)[, 1]
      pf$expected[pf$fold == fold] <- dfe_preds

    }

    # Save accumulated predictions for this fold (and previous folds)
    save(pf, file=new.file)

    # Collect any warnings
    fold.warnings <- unique(c(fold.warnings, paste0("fold ", fold, ": ", names(warnings()))))
    print(fold.warnings)

    # Remove unneeded objects and shut down the h2o server
    rm(list = setdiff(ls(), keep.obj))
    h2o.removeAll()
    h2o.shutdown(prompt=FALSE)

    # Reset the error counter after completing iteration with no error
    error.counter <- 0

    # h2o server seems to have problems if you shut down and restart too quickly
    Sys.sleep(30)

  },
  error = function(e) {
    message("Error: ", e$message)
    error.counter <- error.counter + 1

    # Remove unneeded objects and shut down the h2o server
    rm(list = setdiff(ls(), keep.obj))
    h2o.removeAll()
    h2o.shutdown(prompt=FALSE)

    # h2o server seems to have problems if you shut down and restart too quickly
    Sys.sleep(120) # shut down for 2 minutes
  })

}

fold.warnings <- fold.warnings[!grepl("Please download and install the latest version from",fold.warnings)]
fold.warnings <- fold.warnings[!grepl("^fold [0-9]+[:] *$",fold.warnings)]
print(fold.warnings)
if(length(fold.warnings) > 0){
  readr::write_file(paste0(fold.warnings,collapse = "\n"), file = "Data/warnings1.log")
}

# If we have successfully fitted algorithms for all folds, then do final steps
if(length(GetMissingFolds(pf, 'expected', any.NA = FALSE)) == 0){

  # Create the deviance variable for the whole dataset
  pf$deviance <- pf$negative - pf$expected

  # Save the final results as RData and csv
  save(pf, file=new.file)
  readr::write_csv(pf, file=gsub(".rdata", ".csv", new.file)) # also save copy as csv
}
